libname LOC '\\mead03\browmar$\Kris_Behrens\'; run;
options nofmterr;

*Observations by year, full and in-sample;

%macro frequency(data=, variable=);

	proc freq data = &data;
		tables &variable / norow nocol notot
		out=freq&data&variable;
	run;

%mend frequency;


libname ASM '\\mead03\asm_mead\NAICS\2009 Vintage';

data data1;
	set asm.asm9009_b2;
	counter=1;
run;

*Subset data such that the scope of the file is consistent across years and observations have a latitude and longitude;
**N.B.: Aggegregate and tax records are geocoded to province only and therefore lat and longs are absent;
data data2;
	set data1;
	length NAICS $6. NAICS_NUM 6.;
	NAICS_NUM=INPUT(NAICS, 6.);
	NAICS1 = substr(NAICS,1,1);
	NAICS4 = substr(NAICS,1,4);
	*Subset out longitudinally consistent plants;
	if VST>30000; /*Royce-Miranda rule*/
	if rev_formtype not in ('D01', 'D02', 'D21', 'X02', 'X03', 'Y05', 'Y06', 'R04', 'T02', 'T05'); /*omits aggregate records with no lat/long b/c geog=province only*/
	if totalemp >0;
	if NAICS1 = '3';
	if NAICS not in ('311811', '311830');/*Excludes retail bakeries and tortillia manufacturers for longitudinal consistency*/
	keep MEADENT OECD80 NAICS NAICS_NUM NAICS4 Xlong Ylat totalemp counter yr4 REV_FORMDESC rev_formtype;
run;

%frequency (data=data1, variable=yr4);
%frequency (data=data2, variable=yr4);

data freqdata1yr4;
	set freqdata1yr4;
	full=count;
	keep yr4 full;
run;

data freqdata2yr4;
	set freqdata2yr4;
	insample=count;
	keep yr4 insample;
run;

proc sort data=freqdata1yr4; by yr4; run;
proc sort data=freqdata2yr4; by yr4; run;

data freqsummary;
	merge freqdata1yr4 freqdata2yr4;
	by yr4;
run;

PROC EXPORT DATA= WORK.freqsummary 
            OUTFILE= "Y:\Kris_Behrens\insample.xls" 
            DBMS=EXCEL5 REPLACE;
RUN;


*****************
*				*
*   No weight	*
*				*
****************;

data cdf;
   set loc.cdf_rhs;
   keep year naics NAICSNAM cdf10 cdf25 cdf50 cdf100 plantsgt10 naics10 naics20 kill_vst kill_emp;
run;

*Top 10 most localized;

data cdfa;
	set cdf;
	if plantsgt10=1 and naics10=1; /*restriction: grater than 10 plants in the year and the industry maintains greater than 10 plants for 10+ years*/
run;

proc rank data=cdfa out=cdfrank descending;
	by year;
	var cdf10 cdf25 cdf50 cdf100;
run;

data cdfrank;
	set cdfrank;
	rcdf10=cdf10;
	rcdf25=cdf25;
	rcdf50=cdf50;
	rcdf100=cdf100;
	drop cdf10 cdf25 cdf50 cdf100 naicsnam;
run;

proc sort data=cdfrank; by year naics; run;
proc sort data=cdfa; by year naics; run;

data cdf2;
	merge cdfa cdfrank;
	by year naics;
run;

data cdfr50;
	set cdf2;
	if year=1990 and rcdf50<=10 or year=1999 and rcdf50<=10 or year=2009 and rcdf50<=10 then output cdfr50;
run;
	
PROC EXPORT DATA= WORK.cdfr50 
            OUTFILE= "Y:\Kris_Behrens\cdfr50.xls" 
            DBMS=EXCEL5 REPLACE;
RUN;


*****************
*				*
*   Emp. weight	*
*				*
****************;


data cdf;
   set loc.cdf_emp_rhs;
   keep year naics NAICSNAM cdf10 cdf25 cdf50 cdf100 plantsgt10 naics10 naics20 kill_emp kill_vst;
run;

*Top 10 most localized;

data cdfa;
	set cdf;
	if plantsgt10=1 and naics10=1;/*restriction: grater than 10 plants in the year and the industry maintains greater than 10 plants for 10+ years*/
run;

proc rank data=cdfa out=cdfrank descending;
	by year;
	var cdf10 cdf25 cdf50 cdf100;
run;

data cdfrank;
	set cdfrank;
	rcdf10=cdf10;
	rcdf25=cdf25;
	rcdf50=cdf50;
	rcdf100=cdf100;
	drop cdf10 cdf25 cdf50 cdf100 naicsnam;
run;

proc sort data=cdfrank; by year naics; run;
proc sort data=cdfa; by year naics; run;

data cdf2;
	merge cdfa cdfrank;
	by year naics;
run;

data cdfr50;
	set cdf2;
	if year=1990 and rcdf50<=10 or year=1999 and rcdf50<=10 or year=2009 and rcdf50<=10 then output cdfr50;
run;
	
PROC EXPORT DATA= WORK.cdfr50 
            OUTFILE= "Y:\Kris_Behrens\\cdfr50emp.xls" 
            DBMS=EXCEL5 REPLACE;
RUN;


*****************
*				*
*   VST weight	*
*				*
****************;


data cdf;
   set loc.cdf_vst_rhs;
   keep year naics NAICSNAM cdf10 cdf25 cdf50 cdf100 plantsgt10 naics10 naics20 kill_emp kill_vst;
run;


*Top 10 most localized;

data cdfa;
	set cdf;
	if plantsgt10=1 and naics10=1;/*restriction: grater than 10 plants in the year and the industry maintains greater than 10 plants for 10+ years*/
run;

*Top 10 most localized;

proc rank data=cdfa out=cdfrank descending;
	by year;
	var cdf10 cdf25 cdf50 cdf100;
run;

data cdfrank;
	set cdfrank;
	rcdf10=cdf10;
	rcdf25=cdf25;
	rcdf50=cdf50;
	rcdf100=cdf100;
	drop cdf10 cdf25 cdf50 cdf100 naicsnam;
run;

proc sort data=cdfrank; by year naics; run;
proc sort data=cdfa; by year naics; run;

data cdf2;
	merge cdfa cdfrank;
	by year naics;
run;

data cdfr50;
	set cdf2;
	if year=1990 and rcdf50<=10 or year=1999 and rcdf50<=10 or year=2009 and rcdf50<=10 then output cdfr50;
run;
	
PROC EXPORT DATA= WORK.cdfr50 
            OUTFILE= "Y:\Kris_Behrens\\cdfr50vst.xls" 
            DBMS=EXCEL5 REPLACE;
RUN;


*CDF global stats;

data cdf;
	set loc.cdf_rhs;
run;
proc means data=cdf noprint;
	class year;
	var cdf10 cdf25 cdf50 cdf100 cdf200 cdf500;
	output out=m_cdf mean=;
run;

PROC EXPORT DATA= WORK.m_cdf 
            OUTFILE= "X:\Kris_Behrens\m_cdf.xls" 
            DBMS=EXCEL5 REPLACE;
RUN;


data cdf;
	set loc.cdf_emp_rhs;
run;
proc means data=cdf noprint;
	class year;
	var cdf10 cdf25 cdf50 cdf100 cdf200 cdf500;
	output out=m_cdf mean=;
run;

PROC EXPORT DATA= WORK.m_cdf 
            OUTFILE= "X:\Kris_Behrens\m_cdf_emp.xls" 
            DBMS=EXCEL5 REPLACE;
RUN;


data cdf;
	set loc.cdf_vst_rhs;
run;
proc means data=cdf noprint;
	class year;
	var cdf10 cdf25 cdf50 cdf100 cdf200 cdf500;
	output out=m_cdf mean=;
run;

PROC EXPORT DATA= WORK.m_cdf 
            OUTFILE= "X:\Kris_Behrens\m_cdf_vst.xls" 
            DBMS=EXCEL5 REPLACE;
RUN;

data cdf_oecd;
	set loc.cdf_rhs;
run;

proc means data=cdf noprint;
	class year oecd80;
	var cdf10 cdf25 cdf50 cdf100 cdf200 cdf500;
	output out=m_cdf_oecd mean=;
run;


PROC EXPORT DATA= WORK.m_cdf_oecd 
            OUTFILE= "X:\Kris_Behrens\m_cdf_oecd.xls" 
            DBMS=EXCEL5 REPLACE;
RUN;

*Counts of random, localized and dispersed industries by year;

**No weight;

data tests;
	set loc.loc_nowght;
	random=0;
	localized=0;
	dispersed=0;
	if status=0 then random=1;
	if status=1 then localized=1;
	if status=2 then dispersed=1;
run;

proc means data=tests noprint;
	class year naics;
	var random localized dispersed;
	output out=status mean=;
run;
data status;
	set status;
	if  _TYPE_ = 3;
	drop _TYPE_ _FREQ_;
run;

proc sort data=status; by naics year; run;

proc means data=status noprint;
	class year;
	var random localized dispersed;
	output out=rld sum=;
run;

data rld;
	set rld;
	if  _TYPE_ = 1;
	drop _TYPE_ _FREQ_;
run;
PROC EXPORT DATA= WORK.rld
            OUTFILE= "X:\Kris_Behrens\IO\counts_rld_nowght.xls" 
            DBMS=EXCELCS REPLACE;
     SHEET="plts"; 
RUN;

proc means data=status noprint;
	class naics;
	var random localized dispersed;
	output out=naics_rld sum=;
run;

data naics_rld;
	set naics_rld;
	if _TYPE_ = 1;
	drop _TYPE_ _FREQ_;
run;



proc freq data=naics_rld;
	table localized/ nocol norow nopercent
	out=l_freq;
run;
proc freq data=naics_rld;
	table dispersed/ nocol norow nopercent
	out=d_freq;
run;

proc sort data=status; by naics year; run;

proc transpose data=status out=tstatus
	name=random;
	by naics;
	id year;
run;

**Employment weight;

data tests;
	set loc.loc_empwght;
	random=0;
	localized=0;
	dispersed=0;
	if status=0 then random=1;
	if status=1 then localized=1;
	if status=2 then dispersed=1;
run;

proc means data=tests noprint;
	class year naics;
	var random localized dispersed;
	output out=status mean=;
run;
data status;
	set status;
	if  _TYPE_ = 3;
	drop _TYPE_ _FREQ_;
run;

proc sort data=status; by naics year; run;

proc means data=status noprint;
	class year;
	var random localized dispersed;
	output out=rld sum=;
run;

data rld;
	set rld;
	if  _TYPE_ = 1;
	drop _TYPE_ _FREQ_;
run;
PROC EXPORT DATA= WORK.rld
            OUTFILE= "X:\Kris_Behrens\IO\counts_rld_empwght.xls" 
            DBMS=EXCELCS REPLACE;
     SHEET="emp"; 
RUN;


**Sales weight;

data tests;
	set loc.loc_vstwght;
	random=0;
	localized=0;
	dispersed=0;
	if status=0 then random=1;
	if status=1 then localized=1;
	if status=2 then dispersed=1;
run;

proc means data=tests noprint;
	class year naics;
	var random localized dispersed;
	output out=status mean=;
run;
data status;
	set status;
	if  _TYPE_ = 3;
	drop _TYPE_ _FREQ_;
run;

proc sort data=status; by naics year; run;

proc means data=status noprint;
	class year;
	var random localized dispersed;
	output out=rld sum=;
run;

data rld;
	set rld;
	if  _TYPE_ = 1;
	drop _TYPE_ _FREQ_;
run;
PROC EXPORT DATA= WORK.rld
            OUTFILE= "X:\Kris_Behrens\IO\counts_rld_vstwght.xls" 
            DBMS=EXCELCS REPLACE;
     SHEET="vst"; 
RUN;

*Pi and phi totals by weight;

data counts;
	set loc.loc_nowght;
	keep year naics pi phi;
run;

proc means data=counts noprint;
	class year naics;
	var pi phi;
	output out=piphi_cnt sum=;
run;

data piphi_cnt;
	set piphi_cnt;
	if _TYPE_ = 2;
	drop _TYPE_ _FREQ_ naics;
run;

data employ;
	set loc.loc_empwght;
	keep year naics pi phi;
run;

proc means data=employ noprint;
	class year naics;
	var pi phi;
	output out=piphi_emp sum=;
run;

data piphi_emp;
	set piphi_emp;
	if _TYPE_ = 2;
	pi_emp = pi;
	phi_emp = phi;
	drop _TYPE_ _FREQ_ naics pi phi;
run;

data sales;
	set loc.loc_vstwght;
	keep year naics pi phi;
run;

proc means data=sales noprint;
	class year naics;
	var pi phi;
	output out=piphi_vst sum=;
run;

data piphi_vst;
	set piphi_vst;
	if _TYPE_ = 2;
	pi_vst = pi;
	phi_vst = phi;
	drop _TYPE_ _FREQ_ naics pi phi;
run;

proc sort data=piphi_cnt; by year; run;
proc sort data=piphi_emp; by year; run;
proc sort data=piphi_vst; by year; run;

data piphi_sum;
	merge piphi_cnt piphi_emp piphi_vst;
	by year;
run;

PROC EXPORT DATA= WORK.piphi_sum 
            OUTFILE= "X:\Kris_Behrens\piphi_sum.xls" 
            DBMS=EXCEL5 REPLACE;
RUN;

*Density and local and global confidence intervals;

data naics334410;
	set loc.loc_nowght;
	if naics='334410';
	if year=1990 or year=1999 or year=2009;
run;
PROC EXPORT DATA= WORK.naics334410 
            OUTFILE= "Y:\Kris_Behrens\naics334410.xls" 
            DBMS=EXCEL5 REPLACE;
RUN;

data naics334410;
	set loc.loc_empwght;
	if naics='334410';
	if year=1990 or year=1999 or year=2009;
run;

PROC EXPORT DATA= WORK.naics334410 
            OUTFILE= "Y:\Kris_Behrens\naics334410emp.xls" 
            DBMS=EXCEL5 REPLACE;
RUN;

data naics334410;
	set loc.loc_vstwght;
	if naics='334410';
	if year=1990 or year=1999 or year=2009;
run;

PROC EXPORT DATA= WORK.naics334410 
            OUTFILE= "Y:\Kris_Behrens\naics334410vst.xls" 
            DBMS=EXCEL5 REPLACE;
RUN;

*Description of the distribution of cdf by weight;

data cdf_cnt;
	set loc.cdf_rhs;
run;
proc means data=cdf_cnt noprint;
	class year;
	var cdf50;
	output out=cnt_dist mean=cnt_m p5=cnt5 p10=cnt10 p25=cnt25 p50=cnt50 p75=cnt75 p90=cnt90 p95=cnt95;
run;

data cnt_dist;
	set cnt_dist;
	if _TYPE_=1;
	drop _TYPE_ _FREQ_;
run;

data cdf_emp;
	set loc.cdf_emp_rhs;
run;
proc means data=cdf_emp noprint;
	class year;
	var cdf50;
	output out=emp_dist mean=emp_m p5=emp5 p10=emp10 p25=emp25 p50=emp50 p75=emp75 p90=emp90 p95=emp95;
run;

data emp_dist;
	set emp_dist;
	if _TYPE_=1;
	drop _TYPE_ _FREQ_;
run;

data cdf_vst;
	set loc.cdf_vst_rhs;
run;
proc means data=cdf_vst noprint;
	class year;
	var cdf50;
	output out=vst_dist mean=vst_m p5=vst5 p10=vst10 p25=vst25 p50=vst50 p75=vst75 p90=vst90 p95=vst95;
run;

data vst_dist;
	set vst_dist;
	if _TYPE_=1;
	drop _TYPE_ _FREQ_;
run;


proc sort data=cnt_dist; by year; run;
proc sort data=emp_dist; by year; run;
proc sort data=vst_dist; by year; run;

data all_dist;
	merge cnt_dist emp_dist vst_dist;
	by year;
run;

PROC EXPORT DATA= WORK.all_dist 
            OUTFILE= "X:\Kris_Behrens\cdf_distribution.xls" 
            DBMS=EXCEL5 REPLACE;
RUN;

data cdf_cnt;
	set cdf_cnt;
	cdf50cnt=cdf50;
	keep naics year cdf50cnt;
run;


data cdf_emp;
	set cdf_emp;
	cdf50emp=cdf50;
	keep naics year cdf50emp;
run;


data cdf_vst;
	set cdf_vst;
	cdf50vst=cdf50;
	keep naics year cdf50vst;
run;

proc sort data=cdf_cnt; by naics year; run;
proc sort data=cdf_emp; by naics year; run;
proc sort data=cdf_vst; by naics year; run;

data cdf_all;
	merge cdf_cnt cdf_emp cdf_vst;
	by naics year;
run;

proc sort data=cdf_all; by year naics; run;

proc rank data=cdf_all out=cdf_allrank groups=20;
	by year;
	var cdf50cnt;
	ranks cdf50cntr;
run;

data cdf_allrank;
	set cdf_allrank;
	empcnt=cdf50emp/cdf50cnt;
	vstemp=cdf50vst/cdf50emp;
	vstcnt=cdf50vst/cdf50cnt;
run;

proc means data=cdf_allrank noprint;
	class year cdf50cntr;
	var empcnt vstemp vstcnt;
	output out=rel_cdf50 mean=;
run;


PROC EXPORT DATA= WORK.rel_cdf50 
            OUTFILE= "X:\Kris_Behrens\rel_cdf50.xls" 
            DBMS=EXCEL5 REPLACE;
RUN;


data cdf_cnt;
	set loc.cdf_rhs;
run;

data cdf_emp;
	set loc.cdf_emp_rhs;
run;

data cdf_vst;
	set loc.cdf_vst_rhs;
run;


data cdf_cnt;
	set cdf_cnt;
	cdf10cnt=cdf10;
	keep naics year cdf10cnt;
run;


data cdf_emp;
	set cdf_emp;
	cdf10emp=cdf10;
	keep naics year cdf10emp;
run;


data cdf_vst;
	set cdf_vst;
	cdf10vst=cdf10;
	keep naics year cdf10vst;
run;

proc sort data=cdf_cnt; by naics year; run;
proc sort data=cdf_emp; by naics year; run;
proc sort data=cdf_vst; by naics year; run;

data cdf_all;
	merge cdf_cnt cdf_emp cdf_vst;
	by naics year;
run;

proc sort data=cdf_all; by year naics; run;

proc rank data=cdf_all out=cdf_allrank groups=20;
	by year;
	var cdf10cnt;
	ranks cdf10cntr;
run;

data cdf_allrank;
	set cdf_allrank;
	empcnt=cdf10emp/cdf10cnt;
	vstemp=cdf10vst/cdf10emp;
	vstcnt=cdf10vst/cdf10cnt;
run;

proc means data=cdf_allrank noprint;
	class year cdf10cntr;
	var empcnt vstemp vstcnt;
	output out=rel_cdf10 mean=;
run;


PROC EXPORT DATA= WORK.rel_cdf10 
            OUTFILE= "X:\Kris_Behrens\rel_cdf10.xls" 
            DBMS=EXCEL5 REPLACE;
RUN;



data cdf_cnt;
	set loc.cdf_rhs;
run;

data cdf_emp;
	set loc.cdf_emp_rhs;
run;

data cdf_vst;
	set loc.cdf_vst_rhs;
run;


data cdf_cnt;
	set cdf_cnt;
	cdf200cnt=cdf200;
	keep naics year cdf200cnt;
run;


data cdf_emp;
	set cdf_emp;
	cdf200emp=cdf200;
	keep naics year cdf200emp;
run;


data cdf_vst;
	set cdf_vst;
	cdf200vst=cdf200;
	keep naics year cdf200vst;
run;

proc sort data=cdf_cnt; by naics year; run;
proc sort data=cdf_emp; by naics year; run;
proc sort data=cdf_vst; by naics year; run;

data cdf_all;
	merge cdf_cnt cdf_emp cdf_vst;
	by naics year;
run;

proc sort data=cdf_all; by year naics; run;

proc rank data=cdf_all out=cdf_allrank groups=20;
	by year;
	var cdf200cnt;
	ranks cdf200cntr;
run;

data cdf_allrank;
	set cdf_allrank;
	empcnt=cdf200emp/cdf200cnt;
	vstemp=cdf200vst/cdf200emp;
	vstcnt=cdf200vst/cdf200cnt;
run;

proc means data=cdf_allrank noprint;
	class year cdf200cntr;
	var empcnt vstemp vstcnt;
	output out=rel_cdf200 mean=;
run;


PROC EXPORT DATA= WORK.rel_cdf200 
            OUTFILE= "X:\Kris_Behrens\rel_cdf200.xls" 
            DBMS=EXCEL5 REPLACE;
RUN;

proc means data=loc.cdf_rhs noprint;
	class naics;
	var localised dispersed plantsn;
	output out=infer_cnt sum=;
run;

data cnt_dispersed cnt_localised;
	set infer_cnt;
	plants=plantsn/20;
	if _TYPE_=1 and dispersed>15 then output cnt_dispersed;
	if _TYPE_=1 and localised>15 then output cnt_localised;
run;



proc means data=loc.cdf_vst_rhs noprint;
	class naics;
	var localised dispersed plantsn;
	output out=infer_vst sum=;
run;

data vst_dispersed vst_localised;
	set infer_vst;
	plants=plantsn/20;
	if _TYPE_=1 and dispersed>15 then output vst_dispersed;
	if _TYPE_=1 and localised>15 then output vst_localised;
run;


proc means data=loc.cdf_emp_rhs noprint;
	class naics;
	var localised dispersed plantsn;
	output out=infer_emp sum=;
run;

data emp_dispersed emp_localised;
	set infer_emp;
	plants=plantsn/20;
	if _TYPE_=1 and dispersed>15 then output emp_dispersed;
	if _TYPE_=1 and localised>15 then output emp_localised;
run;


